import pandas as pd

# Show at most 10 columns when displaying dataframes.
pd.set_option("display.max_columns", 10)

# Dataset: KDD CUP 1999 (https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html)
# The 41 feature names are listed at
# https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
colnames = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]

# We use the 10% subset of the original data, streamed straight from
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
# Only the first 100K records are kept; the 42nd column is the label.
df = pd.read_csv(
    "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
    names=colnames + ["threat_type"],
)[:100000]
df.head(3)
| duration | protocol_type | service | flag | src_bytes | ... | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | threat_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | tcp | http | SF | 181 | ... | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
| 1 | 0 | tcp | http | SF | 239 | ... | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
| 2 | 0 | tcp | http | SF | 235 | ... | 0.0 | 0.0 | 0.0 | 0.0 | normal. |
3 rows × 42 columns
pip install plotly
Collecting plotly
Downloading plotly-5.14.1-py2.py3-none-any.whl (15.3 MB)
---------------------------------------- 0.0/15.3 MB ? eta -:--:--
---------------------------------------- 0.1/15.3 MB 1.7 MB/s eta 0:00:10
--------------------------------------- 0.2/15.3 MB 3.0 MB/s eta 0:00:06
- -------------------------------------- 0.5/15.3 MB 4.1 MB/s eta 0:00:04
-- ------------------------------------- 0.8/15.3 MB 4.9 MB/s eta 0:00:03
--- ------------------------------------ 1.5/15.3 MB 6.6 MB/s eta 0:00:03
----- ---------------------------------- 2.2/15.3 MB 8.3 MB/s eta 0:00:02
------- -------------------------------- 2.9/15.3 MB 9.1 MB/s eta 0:00:02
--------- ------------------------------ 3.6/15.3 MB 10.0 MB/s eta 0:00:02
----------- ---------------------------- 4.4/15.3 MB 10.8 MB/s eta 0:00:02
------------- -------------------------- 5.2/15.3 MB 11.5 MB/s eta 0:00:01
---------------- ----------------------- 6.1/15.3 MB 12.3 MB/s eta 0:00:01
----------------- ---------------------- 6.9/15.3 MB 12.6 MB/s eta 0:00:01
------------------- -------------------- 7.6/15.3 MB 13.1 MB/s eta 0:00:01
--------------------- ------------------ 8.3/15.3 MB 13.2 MB/s eta 0:00:01
----------------------- ---------------- 9.2/15.3 MB 13.6 MB/s eta 0:00:01
------------------------- -------------- 9.9/15.3 MB 13.8 MB/s eta 0:00:01
--------------------------- ----------- 10.8/15.3 MB 16.0 MB/s eta 0:00:01
----------------------------- --------- 11.6/15.3 MB 16.8 MB/s eta 0:00:01
------------------------------- ------- 12.4/15.3 MB 17.2 MB/s eta 0:00:01
--------------------------------- ----- 13.3/15.3 MB 17.7 MB/s eta 0:00:01
----------------------------------- --- 14.1/15.3 MB 17.7 MB/s eta 0:00:01
------------------------------------- - 14.9/15.3 MB 17.7 MB/s eta 0:00:01
-------------------------------------- 15.3/15.3 MB 17.2 MB/s eta 0:00:01
--------------------------------------- 15.3/15.3 MB 16.4 MB/s eta 0:00:00
Requirement already satisfied: packaging in c:\users\mangl\anaconda3\envs\env1_ids\lib\site-packages (from plotly) (23.0)
Collecting tenacity>=6.2.0
Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.14.1 tenacity-8.2.2
Note: you may need to restart the kernel to use updated packages.
import plotly.graph_objects as go
from collections import Counter

# Frequency of each threat label, in first-seen order.
threat_count_dict = Counter(df["threat_type"])
threat_types = list(threat_count_dict.keys())
threat_counts = list(threat_count_dict.values())
print("Total distinct number of threat types : ", len(threat_types))

# Bar chart of label frequencies, annotated with the raw counts.
bar = go.Bar(x=threat_types, y=threat_counts, text=threat_counts, textposition='auto')
fig = go.Figure([bar])
fig.show()
Total distinct number of threat types : 20
# 34 numerical columns are considered for training.
# (Fixed: variable was misspelled "numerical_colmanes".)
numerical_colnames = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
numerical_df = df[numerical_colnames].copy()
# Drop columns holding a single constant value: they carry no information
# and would break the max-scaling below (an all-zero column has max 0).
numerical_df = numerical_df.loc[:, (numerical_df != numerical_df.iloc[0]).any()]
# Scale each column into [0, 1] by dividing by its column maximum.
# N.B. the data has no negative values, and constant columns were removed
# above, so every remaining column maximum is strictly positive.
final_df = numerical_df / numerical_df.max()
X = final_df.values
# The final dataframe has 33 features.
print("Shape of feature matrix : ", X.shape)
Shape of feature matrix : (100000, 33)
from sklearn.preprocessing import LabelEncoder

# Encode the string threat labels as integer class ids.  The fitted
# `encoder` is kept so predictions can later be mapped back to label names
# via `encoder.inverse_transform`.
threat_types = df["threat_type"].values
encoder = LabelEncoder()
y = encoder.fit_transform(threat_types)
print("Shape of target vector : ", y.shape)
Shape of target vector : (100000,)
from sklearn.model_selection import train_test_split

# 60/40 split, stratified on the label so every threat type keeps its
# original proportion in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)
print("Number of records in training data : ", X_train.shape[0])
print("Number of records in test data : ", X_test.shape[0])
print("Total distinct number of threat types in training data : ", len(set(y_train)))
print("Total distinct number of threat types in test data : ", len(set(y_test)))
Number of records in training data : 60000 Number of records in test data : 40000 Total distinct number of threat types in training data : 20 Total distinct number of threat types in test data : 20
!pip install syft==0.2.3a1
%%capture
import torch
import syft as sy
# Hook PyTorch, i.e. add extra functionality to tensors/modules so they can
# be sent to and trained on remote workers (Federated Learning support).
hook = sy.TorchHook(torch)
# Fix the seed so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(1)
# Select CPU computation; to use a GPU pass "cuda" instead.
device = torch.device("cpu")
# Data will be distributed among these VirtualWorkers; remote training of
# the model will happen there.  NOTE(review): "gatway" is a typo for
# "gateway", kept as-is because later cells reference these exact names.
gatway1 = sy.VirtualWorker(hook, id="gatway1")
gatway2 = sy.VirtualWorker(hook, id="gatway2")
import numpy as np

# Training hyper-parameters.
BATCH_SIZE = 1000    # records per mini-batch
EPOCHS = 6           # number of full passes over the training data
LOG_INTERVAL = 5     # log training loss every LOG_INTERVAL batches
lr = 0.01            # SGD learning rate

# Model dimensions are derived from the data itself.
n_feature = X_train.shape[1]
n_class = len(np.unique(y_train))
print("Number of training features : ", n_feature)
print("Number of training classes : ", n_class)
Number of training features : 33 Number of training classes : 20
# Create pytorch tensors from X_train, y_train, X_test, y_test.
# Tags let syft workers search for / identify the datasets.
train_inputs = torch.tensor(X_train,dtype=torch.float).tag("#iot", "#network","#data","#train")
train_labels = torch.tensor(y_train).tag("#iot", "#network","#target","#train")
test_inputs = torch.tensor(X_test,dtype=torch.float).tag("#iot", "#network","#data","#test")
test_labels = torch.tensor(y_test).tag("#iot", "#network","#target","#test")
# Send the training and test data to the gatways in equal proportion:
# the first half of each split goes to gatway1, the second half to gatway2.
train_idx = int(len(train_labels)/2)
test_idx = int(len(test_labels)/2)
gatway1_train_dataset = sy.BaseDataset(train_inputs[:train_idx], train_labels[:train_idx]).send(gatway1)
gatway2_train_dataset = sy.BaseDataset(train_inputs[train_idx:], train_labels[train_idx:]).send(gatway2)
gatway1_test_dataset = sy.BaseDataset(test_inputs[:test_idx], test_labels[:test_idx]).send(gatway1)
gatway2_test_dataset = sy.BaseDataset(test_inputs[test_idx:], test_labels[test_idx:]).send(gatway2)
# Create federated datasets, an extension of the PyTorch TensorDataset class.
federated_train_dataset = sy.FederatedDataset([gatway1_train_dataset, gatway2_train_dataset])
federated_test_dataset = sy.FederatedDataset([gatway1_test_dataset, gatway2_test_dataset])
# Create federated dataloaders, an extension of the PyTorch DataLoader class.
# Training batches are shuffled; test batches are served in order.
federated_train_loader = sy.FederatedDataLoader(federated_train_dataset, shuffle=True, batch_size=BATCH_SIZE)
federated_test_loader = sy.FederatedDataLoader(federated_test_dataset, shuffle=False, batch_size=BATCH_SIZE)
import torch.nn as nn


class Net(nn.Module):
    """Single linear layer classifier (multinomial logistic regression).

    Args:
        input_dim: number of input features.
        output_dim: number of labels (one logit per class).
    """

    def __init__(self, input_dim, output_dim):
        super(Net, self).__init__()
        # Use the `nn` alias imported above instead of `torch.nn.Linear`
        # for consistency with the import.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return unnormalised class scores (logits) for input `x`."""
        return self.linear(x)
import torch.nn.functional as F


def train(model, device, federated_train_loader, optimizer, epoch):
    """Run one epoch of federated training.

    For every batch, the model is shipped to the worker holding that batch
    (`data.location`), trained there, and pulled back.

    Args:
        model: the PyTorch model to train (modified in place).
        device: torch.device used for computation.
        federated_train_loader: syft FederatedDataLoader yielding pointer tensors.
        optimizer: optimizer over ``model.parameters()``.
        epoch: 1-based epoch number, used only for logging.
    """
    model.train()
    # Iterate through each gateway's dataset.
    for idx, (data, target) in enumerate(federated_train_loader):
        batch_idx = idx + 1
        # Send the model to the gateway that owns this batch.
        model.send(data.location)
        # Move the data and target labels to the device for computation.
        data, target = data.to(device), target.to(device)
        # Clear previous gradients (if they exist).
        optimizer.zero_grad()
        # Make a prediction.
        output = model(data)
        # Calculate the cross entropy loss [we are doing classification].
        loss = F.cross_entropy(output, target)
        # Calculate the gradients.
        loss.backward()
        # Update the model weights.
        optimizer.step()
        # Get the model back from the gateway.
        model.get()
        # Log every LOG_INTERVAL batches and always on the final batch.
        # (batch_idx starts at 1, so the original `batch_idx != 0` guard
        # was redundant and has been removed.)
        if batch_idx == len(federated_train_loader) or batch_idx % LOG_INTERVAL == 0:
            # Pull the remote loss tensor back before reading its value.
            loss = loss.get()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * BATCH_SIZE, len(federated_train_loader) * BATCH_SIZE,
                100. * batch_idx / len(federated_train_loader), loss.item()))
import torch.nn.functional as F


def test(model, device, federated_test_loader):
    """Evaluate the model on the federated test set.

    For each batch the model is sent to the gateway holding the data,
    scored there, and pulled back.  Prints overall accuracy and a loss.
    NOTE(review): only the final batch's loss is reported, not an
    average over the whole test set.
    """
    model.eval()
    correct = 0
    # Gradients are not needed during evaluation.
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(federated_test_loader):
            # Send the model to the right gateway.
            model.send(data.location)
            # Move the data and target labels to the device (cpu/gpu) for computation.
            data, target = data.to(device), target.to(device)
            # Make a prediction.
            output = model(data)
            # Get the model back from the gateway.
            model.get()
            # Calculate the cross entropy loss for this batch.
            loss = F.cross_entropy(output, target)
            # Get the index of the max log-probability.
            pred = output.argmax(1, keepdim=True)
            # Count instances correctly predicted; .get() pulls the remote
            # per-batch count back to the local worker.
            correct += pred.eq(target.view_as(pred)).sum().get()
        # Pull the (last batch's) loss tensor back so .item() can read it.
        loss = loss.get()
        print('Test set: Loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            loss.item(), correct, len(federated_test_loader.federated_dataset),
            100. * correct / len(federated_test_loader.federated_dataset)))
%%time
import torch.optim as optim
# Initialize the model with the feature/class counts derived from the data.
model = Net(n_feature,n_class)
# Initialize the SGD optimizer over the model's parameters.
optimizer = optim.SGD(model.parameters(), lr=lr)
# Alternate federated training and federated evaluation for EPOCHS rounds.
for epoch in range(1, EPOCHS + 1):
    # Train on the training data in a federated way.
    train(model, device, federated_train_loader, optimizer, epoch)
    # Check the test accuracy on unseen test data in a federated way.
    test(model, device, federated_test_loader)
Train Epoch: 1 [5000/60000 (8%)] Loss: 3.063372 Train Epoch: 1 [10000/60000 (17%)] Loss: 2.978291 Train Epoch: 1 [15000/60000 (25%)] Loss: 2.902188 Train Epoch: 1 [20000/60000 (33%)] Loss: 2.816142 Train Epoch: 1 [25000/60000 (42%)] Loss: 2.744145 Train Epoch: 1 [30000/60000 (50%)] Loss: 2.669981 Train Epoch: 1 [35000/60000 (58%)] Loss: 2.592572 Train Epoch: 1 [40000/60000 (67%)] Loss: 2.530834 Train Epoch: 1 [45000/60000 (75%)] Loss: 2.460329 Train Epoch: 1 [50000/60000 (83%)] Loss: 2.393310 Train Epoch: 1 [55000/60000 (92%)] Loss: 2.313784 Train Epoch: 1 [60000/60000 (100%)] Loss: 2.245094 Test set: Loss: 2.2336, Accuracy: 22771/40000 (57%) Train Epoch: 2 [5000/60000 (8%)] Loss: 2.184924 Train Epoch: 2 [10000/60000 (17%)] Loss: 2.110788 Train Epoch: 2 [15000/60000 (25%)] Loss: 2.069961 Train Epoch: 2 [20000/60000 (33%)] Loss: 2.005506 Train Epoch: 2 [25000/60000 (42%)] Loss: 1.954772 Train Epoch: 2 [30000/60000 (50%)] Loss: 1.897917 Train Epoch: 2 [35000/60000 (58%)] Loss: 1.840582 Train Epoch: 2 [40000/60000 (67%)] Loss: 1.812792 Train Epoch: 2 [45000/60000 (75%)] Loss: 1.766029 Train Epoch: 2 [50000/60000 (83%)] Loss: 1.719928 Train Epoch: 2 [55000/60000 (92%)] Loss: 1.664493 Train Epoch: 2 [60000/60000 (100%)] Loss: 1.646868 Test set: Loss: 1.6117, Accuracy: 30686/40000 (77%) Train Epoch: 3 [5000/60000 (8%)] Loss: 1.570055 Train Epoch: 3 [10000/60000 (17%)] Loss: 1.545749 Train Epoch: 3 [15000/60000 (25%)] Loss: 1.495446 Train Epoch: 3 [20000/60000 (33%)] Loss: 1.453253 Train Epoch: 3 [25000/60000 (42%)] Loss: 1.438428 Train Epoch: 3 [30000/60000 (50%)] Loss: 1.387116 Train Epoch: 3 [35000/60000 (58%)] Loss: 1.358040 Train Epoch: 3 [40000/60000 (67%)] Loss: 1.356826 Train Epoch: 3 [45000/60000 (75%)] Loss: 1.335830 Train Epoch: 3 [50000/60000 (83%)] Loss: 1.289753 Train Epoch: 3 [55000/60000 (92%)] Loss: 1.251103 Train Epoch: 3 [60000/60000 (100%)] Loss: 1.243339 Test set: Loss: 1.2259, Accuracy: 30686/40000 (77%) Train Epoch: 4 [5000/60000 (8%)] Loss: 
1.222930 Train Epoch: 4 [10000/60000 (17%)] Loss: 1.171059 Train Epoch: 4 [15000/60000 (25%)] Loss: 1.148782 Train Epoch: 4 [20000/60000 (33%)] Loss: 1.152593 Train Epoch: 4 [25000/60000 (42%)] Loss: 1.156718 Train Epoch: 4 [30000/60000 (50%)] Loss: 1.089015 Train Epoch: 4 [35000/60000 (58%)] Loss: 1.098877 Train Epoch: 4 [40000/60000 (67%)] Loss: 1.051879 Train Epoch: 4 [45000/60000 (75%)] Loss: 1.010052 Train Epoch: 4 [50000/60000 (83%)] Loss: 1.013699 Train Epoch: 4 [55000/60000 (92%)] Loss: 1.003037 Train Epoch: 4 [60000/60000 (100%)] Loss: 1.006362 Test set: Loss: 0.9781, Accuracy: 30687/40000 (77%) Train Epoch: 5 [5000/60000 (8%)] Loss: 0.984492 Train Epoch: 5 [10000/60000 (17%)] Loss: 0.958900 Train Epoch: 5 [15000/60000 (25%)] Loss: 0.962639 Train Epoch: 5 [20000/60000 (33%)] Loss: 0.913012 Train Epoch: 5 [25000/60000 (42%)] Loss: 0.925821 Train Epoch: 5 [30000/60000 (50%)] Loss: 0.915670 Train Epoch: 5 [35000/60000 (58%)] Loss: 0.891610 Train Epoch: 5 [40000/60000 (67%)] Loss: 0.885203 Train Epoch: 5 [45000/60000 (75%)] Loss: 0.849125 Train Epoch: 5 [50000/60000 (83%)] Loss: 0.863243 Train Epoch: 5 [55000/60000 (92%)] Loss: 0.833856 Train Epoch: 5 [60000/60000 (100%)] Loss: 0.863984 Test set: Loss: 0.8101, Accuracy: 30695/40000 (77%) Train Epoch: 6 [5000/60000 (8%)] Loss: 0.868828 Train Epoch: 6 [10000/60000 (17%)] Loss: 0.828386 Train Epoch: 6 [15000/60000 (25%)] Loss: 0.804441 Train Epoch: 6 [20000/60000 (33%)] Loss: 0.767164 Train Epoch: 6 [25000/60000 (42%)] Loss: 0.756120 Train Epoch: 6 [30000/60000 (50%)] Loss: 0.766257 Train Epoch: 6 [35000/60000 (58%)] Loss: 0.798183 Train Epoch: 6 [40000/60000 (67%)] Loss: 0.782674 Train Epoch: 6 [45000/60000 (75%)] Loss: 0.756474 Train Epoch: 6 [50000/60000 (83%)] Loss: 0.692317 Train Epoch: 6 [55000/60000 (92%)] Loss: 0.704085 Train Epoch: 6 [60000/60000 (100%)] Loss: 0.719235 Test set: Loss: 0.6928, Accuracy: 38287/40000 (96%) CPU times: user 5hour, sys: 22.2 s, total: 5hour 22s Wall time: 5hour 20s
# Persist the trained weights to disk.
MODEL_PATH = "binaize-threat-model.pt"
torch.save(model.state_dict(), MODEL_PATH)

# Rebuild an identical architecture and restore the saved weights into it,
# then switch to evaluation mode for inference.
model_new = Net(n_feature, n_class)
model_new.load_state_dict(torch.load(MODEL_PATH))
model_new.eval()
Net( (linear): Linear(in_features=33, out_features=20, bias=True) )
idx = 122
# Score one held-out record and map the predicted/actual class ids back to
# their threat-type names via the fitted label encoder.
data = test_inputs[idx]
pred = model_new(data)
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)
actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)
Predicted threat type : neptune. Actual threat type : neptune.
idx = 159
# Score one held-out record and compare predicted vs. actual threat type.
data = test_inputs[idx]
pred = model_new(data)
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)
actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)
Predicted threat type : normal. Actual threat type : ipsweep.
idx = 100
# Score one held-out record and compare predicted vs. actual threat type.
data = test_inputs[idx]
pred = model_new(data)
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)
actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)
Predicted threat type : normal. Actual threat type : normal.
idx = 56
# Score one held-out record and compare predicted vs. actual threat type.
data = test_inputs[idx]
pred = model_new(data)
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)
actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)
Predicted threat type : neptune. Actual threat type : neptune.
idx = 88
# Score one held-out record and compare predicted vs. actual threat type.
data = test_inputs[idx]
pred = model_new(data)
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)
actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)
Predicted threat type : normal. Actual threat type : smurf.